import urllib.request
import os
# Remote location of the Titanic spreadsheet and the local path it is cached at.
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="/Users/hannah/titanic3.xls"
# Download only when no local copy exists yet.
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    # `result` is only bound on this branch, so it is reported here rather
    # than after the `if` (which would raise NameError on a cached file).
    print('downloaded:',result)
output:
downloaded: ('/Users/hannah/titanic3.xls', <http.client.HTTPMessage object at 0x10a450400>)
import numpy
import pandas as pd
# Read the spreadsheet at `filepath` into a DataFrame and preview its first two rows.
all_df = pd.read_excel(filepath)
all_df.head(2)
output:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
# Narrow the frame to the nine columns listed in `cols`, in that order,
# then preview the first two rows.
cols = [
    'survived', 'name', 'pclass', 'sex', 'age',
    'sibsp', 'parch', 'fare', 'embarked',
]
all_df = all_df[cols]
all_df.head(2)
output:
survived name pclass sex age sibsp parch fare embarked
0 1 Allen, Miss. Elisabeth Walton 1 female 29.0000 0 0 211.3375 S
1 1 Allison, Master. Hudson Trevor 1 male 0.9167 1 2 151.5500 S
# Working copy without the `name` column; `all_df` itself is not modified.
df = all_df.drop('name', axis=1)
# Per-column count of missing values in the full frame (still including `name`).
pd.isnull(all_df).sum()
output:
survived 0
name 0
pclass 0
sex 0
age 263
sibsp 0
parch 0
fare 1
embarked 2
dtype: int64
# Fill the missing numeric values with the mean of their own column.
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
# Separate name for the fare mean, so `age_mean` keeps holding the age mean.
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
# Encode `sex` as an integer: female -> 0, male -> 1.
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
# One-hot encode `embarked`. dtype=int keeps the indicator columns as 0/1
# integers on pandas versions whose default indicator dtype is bool; bool
# columns mixed with floats would make `.values` an object array.
# Rows with a missing `embarked` value get 0 in every indicator column.
x_OneHot_df = pd.get_dummies(data=df, columns=["embarked"], dtype=int)
x_OneHot_df[:2]
output:
survived pclass sex age sibsp parch fare embarked_C embarked_Q embarked_S
0 1 1 0 29.0000 0 0 211.3375 0 0 1
1 1 1 1 0.9167 1 2 151.5500 0 0 1
# Convert the encoded frame to a NumPy array: one array row per frame row,
# columns in the same order as x_OneHot_df.
ndarray=x_OneHot_df.values
# Inspect the (rows, columns) dimensions of the array.
ndarray.shape
output:
(1309, 10)
# Show the first two rows of the array.
ndarray[:2]
output:
array([[ 1. , 1. , 0. , 29. , 0. , 0. ,
211.3375, 0. , 0. , 1. ],
[ 1. , 1. , 1. , 0.9167, 1. , 2. ,
151.55 , 0. , 0. , 1. ]])
Label=ndarray[:0]
Features=ndarray[:,1:]
Label[:2]
output:
array([], shape=(0, 10), dtype=float64)